Appendix


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sea
%matplotlib inline

In [10]:
import json

# Load the exported JSON and pull out its two top-level sections:
# "simple_analysis" (indexed by metric name further down) and "data"
# (the records that get flattened into a DataFrame).
fn = 'RubygemDigger--Steps--GenerateJsonForLastVersions--1.data.json'
with open(fn) as f:
    data = json.load(f)
simple_analysis = data["simple_analysis"]
keys = simple_analysis.keys()
d = data["data"]

# `pd.io.json.json_normalize` is the deprecated location of this function;
# prefer the public `pd.json_normalize` and only fall back to the old path
# on pandas versions that predate it.
_json_normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize
df = _json_normalize(d)

In [5]:
def error_rate(field):
    """Return the averaged stddev percentage of `field` minus the magnitude
    of its improvement rate.

    `field` is a mapping with "maintained" and "abandoned" entries, each
    holding "avg" and "stddev" values. Lower results mean the difference
    between the two groups is large relative to their spread.
    """
    spread = average_stddev_percent(field)
    gain = improved_rate(field)
    return spread - abs(gain)

def improved_rate(field):
    """Return the percentage change of the "abandoned" average relative to
    the "maintained" average.

    Positive when the abandoned average is higher than the maintained one.
    Raises ZeroDivisionError if the maintained average is zero.
    """
    baseline = field["maintained"]["avg"]
    delta = field["abandoned"]["avg"] - baseline
    return delta * 100 / baseline
def stddev_percent(data):
    """Return the "stddev" of `data` expressed as a percentage of its "avg".

    Raises ZeroDivisionError if the average is zero.
    """
    scaled = data["stddev"] * 100
    return scaled / data["avg"]
def average_stddev_percent(field):
    """Return the mean of the stddev percentages of the "maintained" and
    "abandoned" entries of `field`."""
    maintained_pct = stddev_percent(field["maintained"])
    abandoned_pct = stddev_percent(field["abandoned"])
    return (maintained_pct + abandoned_pct) / 2

In [6]:
def print_label(name, f):
    """Print one indented line with the avg, stddev and stddev percentage
    stored under `f[name]`."""
    stats = f[name]
    template = "   %s:\tavg: %6.3f\tstddev: %6.3f\tstddev percent:%6.1f%%"
    print(template % (name, stats["avg"], stats["stddev"], stddev_percent(stats)))
    
# Score every metric with error_rate(), then report the ten lowest-scoring
# metrics, each followed by its per-group statistics.
impacts = {}
for k in keys:
    impacts[k] = error_rate(simple_analysis[k])

ranked = sorted(impacts.items(), key=lambda pair: pair[1])
for metric, rate in ranked[:10]:
    f = simple_analysis[metric]
    headline = "Metrics: %s\tImprovement:%6.1f%%\tError rate:%6.1f%%"
    print(headline % (metric, improved_rate(f), rate))
    for name in ('maintained', 'abandoned', 'with_issues'):
        print_label(name, f)


Metrics: style_	Improvement:  24.8%	Error rate:  26.0%
   maintained:	avg: 48.401	stddev: 26.349	stddev percent:  54.4%
   abandoned:	avg: 60.395	stddev: 28.462	stddev percent:  47.1%
   with_issues:	avg: 56.487	stddev: 30.394	stddev percent:  53.8%
Metrics: avg_ccn	Improvement:   3.2%	Error rate:  29.2%
   maintained:	avg:  1.757	stddev:  0.581	stddev percent:  33.1%
   abandoned:	avg:  1.813	stddev:  0.573	stddev percent:  31.6%
   with_issues:	avg:  1.762	stddev:  0.370	stddev percent:  21.0%
Metrics: lint_	Improvement:  26.6%	Error rate:  32.5%
   maintained:	avg:  6.688	stddev:  4.296	stddev percent:  64.2%
   abandoned:	avg:  8.467	stddev:  4.570	stddev percent:  54.0%
   with_issues:	avg:  6.913	stddev:  3.918	stddev percent:  56.7%
Metrics: metrics_	Improvement:  20.1%	Error rate:  38.7%
   maintained:	avg:  6.766	stddev:  4.183	stddev percent:  61.8%
   abandoned:	avg:  8.123	stddev:  4.529	stddev percent:  55.8%
   with_issues:	avg:  7.851	stddev:  5.327	stddev percent:  67.8%
Metrics: reek_total	Improvement:   8.0%	Error rate:  47.3%
   maintained:	avg: 101.423	stddev: 57.340	stddev percent:  56.5%
   abandoned:	avg: 109.536	stddev: 59.189	stddev percent:  54.0%
   with_issues:	avg: 131.114	stddev: 46.662	stddev percent:  35.6%
Metrics: TooManyStatements	Improvement:  10.3%	Error rate:  57.8%
   maintained:	avg: 12.049	stddev:  7.957	stddev percent:  66.0%
   abandoned:	avg: 13.296	stddev:  9.334	stddev percent:  70.2%
   with_issues:	avg: 14.618	stddev:  7.688	stddev percent:  52.6%
Metrics: warning_count	Improvement:  10.8%	Error rate:  65.3%
   maintained:	avg:  4.686	stddev:  3.456	stddev percent:  73.8%
   abandoned:	avg:  5.190	stddev:  4.069	stddev percent:  78.4%
   with_issues:	avg:  5.537	stddev:  3.433	stddev percent:  62.0%
Metrics: metrics_perceivedcomplexity	Improvement:  26.3%	Error rate:  68.6%
   maintained:	avg:  0.770	stddev:  0.765	stddev percent:  99.3%
   abandoned:	avg:  0.973	stddev:  0.881	stddev percent:  90.5%
   with_issues:	avg:  1.106	stddev:  0.884	stddev percent:  79.9%
Metrics: DuplicateMethodCall	Improvement:  11.5%	Error rate:  68.7%
   maintained:	avg: 26.379	stddev: 21.839	stddev percent:  82.8%
   abandoned:	avg: 29.420	stddev: 22.845	stddev percent:  77.7%
   with_issues:	avg: 36.461	stddev: 22.650	stddev percent:  62.1%
Metrics: metrics_cyclomaticcomplexity	Improvement:  24.8%	Error rate:  68.7%
   maintained:	avg:  0.837	stddev:  0.817	stddev percent:  97.7%
   abandoned:	avg:  1.044	stddev:  0.933	stddev percent:  89.3%
   with_issues:	avg:  1.036	stddev:  1.074	stddev percent: 103.7%

In [7]:
# Keep only the rows whose "stat.avg_nloc" value is below 20.
nloc_below_limit = df["stat.avg_nloc"] < 20
df = df[nloc_below_limit]

In [12]:
# Split the records into the two plotted groups.
# NOTE(review): every row whose label is not "abandoned" ends up under the
# "maintained" tick. `simple_analysis` also reports a "with_issues" group, so
# if `df` carries that label too those rows are included here -- confirm this
# is intended.
abdf = df[df["label"] == "abandoned"]
mtdf = df[df["label"] != "abandoned"]
positions = np.arange(2)
bar_colors = ['red', 'green']

# One figure per metric, for the five metrics with the lowest error rate.
for metric, _rate in sorted(impacts.items(), key=lambda item: item[1])[:5]:
    field = 'stat.' + metric
    groups = (abdf[field], mtdf[field])
    plt.figure(field)
    plt.title(field)
    means = np.array([g.mean() for g in groups])
    maxes = np.array([g.max() for g in groups])
    mins = np.array([g.min() for g in groups])
    std = np.array([g.std() for g in groups])

    # create stacked errorbars: the thick bar spans +/- one standard
    # deviation, the thin bar spans the full min..max range around the mean.
    plt.errorbar(positions, means, std, fmt='ok', ecolor=bar_colors, lw=20)
    plt.errorbar(positions, means, [means - mins, maxes - means],
                 fmt='.k', ecolor=bar_colors, lw=2)
    plt.xticks(positions, ['abandoned', 'maintained'])
    plt.xlim(-1, 2)



In [ ]: